Load data and libraries
##################
# LOAD LIBRARIES #
##################
library(tidyverse)
library(Seurat)
library(tidyseurat)
library(cowplot)
library(harmony)
source("../bin/plotting_functions.R")
#########
# PATHS #
#########
input_dir <- "../results/01_QC_st_data/"
result_dir <- "../results/02_integrate_st_data/"
if( isFALSE(dir.exists(result_dir)) ) { dir.create(result_dir,recursive = TRUE) }
#############
# LODA DATA #
#############
DATA <- readRDS(paste0(input_dir,"seuratObj_filtered.RDS"))
Identify Highly Variable Genes (HVG) across samples
################################
# SPLIT INTO SEPERATE DATASETS #
################################
DATA_nested <- DATA %>%
mutate(batch = orig.ident) %>%
nest(data = -batch) %>%
mutate(data = imap(
data, ~ .x %>%
NormalizeData(., normalization.method = "LogNormalize",
verbose = FALSE) %>%
FindVariableFeatures(selection.method = "vst",
nfeatures = 2000,
verbose = FALSE) )) %>%
mutate(data = setNames(.[["data"]], .$batch))
#########################################
# FIND HIGLY VARIABLE GENES PER DATASET #
#########################################
hvgs_heat <- DATA_nested %>%
.$data %>%
map(., ~ .x@assays$RNA@var.features) %>%
( function(x){unique(unlist(x)) ->> hvgs_all; return(x)} ) %>%
# intersect across all samples:
( function(x){Reduce(intersect, x) ->> hvgs; return(x)} ) %>%
imap_dfc(., ~hvgs_all %in% .x, .id=.y) %>%
mutate(rownames = hvgs_all) %>%
column_to_rownames(var = "rownames")
# choose the hvg present in at least two samples:
hig_var <- rownames(hvgs_heat)[rowSums(hvgs_heat)>2]
# remove all VDJ-genes from list of HVG
remove <- str_subset(hig_var, "^IGH|^IGK|^IGL|^TRA|^TRB|^TRD|^TRG")
hig_var <- setdiff(hig_var, remove)
Heatmap of HVG in all samples
pheatmap::pheatmap(t(hvgs_heat * 1), cluster_rows = F, color = c("grey90", "grey20"))

Integration
############
# HARMONY #
###########
DATA <- DATA %>%
SCTransform(verbose = FALSE) %>%
FindVariableFeatures(selection.method = "vst",
nfeatures = 4000,
verbose = FALSE) %>%
ScaleData(verbose = FALSE, features = hig_var ) %>%
RunPCA(verbose = FALSE, npcs = 50) %>%
RunUMAP(dims = 1:50,
n.components = 2L,
n.neighbors = 10,
min.dist = .1,
spread = .3)
DATA <- DATA %>%
RunHarmony(group.by.vars = "orig.ident",
reduction.use = "pca",
dims.use = 1:50,
assay.use = "RNA") #%>%
DATA <- DATA %>%
RunUMAP(dims = 1:50,
n.neighbors = 10,
min.dist = .1,
spread = 1,
repulsion.strength = 1,
negative.sample.rate = 10,
n.epochs = 100,
reduction = "harmony",
reduction.name = "umapharmony")
Alternative graph based UMAP
integrated <- DATA@reductions$harmony@cell.embeddings
ann <- RcppHNSW::hnsw_build(as.matrix(integrated), distance = "cosine")
knn <- RcppHNSW::hnsw_search(as.matrix(integrated) , ann = ann , k = 15)
UU2 <- uwot::umap(X = NULL,
nn_method = knn,
n_components = 2,
ret_extra = c("model","fgraph"),
verbose = T,
min_dist = 0.1,
spread = .3,
repulsion_strength = 1,
negative_sample_rate = 10,
n_epochs = 150,
n_threads = 8)
dimnames(UU2$embedding) <- list(colnames(DATA),paste0("umap_harmony_knn_", 1:2))
DATA@reductions[["umap_harmony_knn"]] <- CreateDimReducObject(embeddings = UU2$embedding,
key = "umap_harmony_knn_")
colnames(DATA@reductions$umap_harmony_knn@cell.embeddings) <- paste0("umap_harmony_knn_", 1:2)
res <- c("umapharmony", "umap_harmony_knn")
p <- map(res, ~plot_clusters.fun(DATA, red=.x, cluster="orig.ident", lable=FALSE, txt_size = 7))
plot_grid(ncol = 2,
plotlist = p)

Plot before and after integration
# dev.new(height=6, width=6.6929133858, noRStudioGD = TRUE)
res <- c("PC", "harmony", "UMAP", "umapharmony")
title <- c("PCA raw data", "PCA Harmony integrated", "UMAP raw data", "UMAP Harmony integrated")
p <- map2(res, title,
~plot_clusters.fun(DATA,
cluster="orig.ident", txt_size = 9,
red=.x, lable=FALSE, title=.y))
plot_grid(ncol = 2,
plotlist = p)
